Data Backfill

The goal of the backfill process is to pull the historical data for the required series using the settings.json file. This includes the following steps:

Load Libraries

Code
import eia_api as api
import eia_data 
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
from ydata_profiling import ProfileReport
Code
raw_json = open("../settings/series.json")
meta_json = json.load(raw_json)
series = pd.DataFrame(meta_json["series"])
api_path = meta_json["api_path"]
Code
facets_template = {
  "parent" : None,
  "subba" : None
}

start = datetime.datetime(meta_json["start"]["year"],
 meta_json["start"]["month"], 
 meta_json["start"]["day"], 
 meta_json["start"]["hour"])
end = datetime.datetime(meta_json["end"]["year"],
 meta_json["end"]["month"], 
 meta_json["end"]["day"], 
 meta_json["end"]["hour"])

offset = 2250

eia_api_key = os.getenv('EIA_API_KEY')

meta_path = meta_json["meta_path"]
data_path = meta_json["data_path"]
Code
metadata = api.eia_metadata(api_key = eia_api_key, api_path = api_path)
print(metadata.meta.keys())
print(metadata.meta["startPeriod"])
print(metadata.meta["endPeriod"])
dict_keys(['id', 'name', 'description', 'frequency', 'facets', 'data', 'startPeriod', 'endPeriod', 'defaultDateFormat', 'defaultFrequency'])
2018-06-19T05
2024-06-22T07
Code
for i in series.index:
  facets = facets_template
  facets["parent"] = series.at[i, "parent_id"]
  facets["subba"] = series.at[i, "subba_id"]
  print(facets)
  temp = api.eia_backfill(api_key = eia_api_key, 
        api_path = api_path+ "data", 
        facets = facets, 
        start = start,
        end = end,
        offset = offset) 
  ts_obj = pd.DataFrame(np.arange(start = start, stop = end + datetime.timedelta(hours = 1), step = datetime.timedelta(hours = 1)).astype(datetime.datetime), columns=["index"])
  ts_obj  = ts_obj.merge(temp.data, left_on = "index", right_on = "period", how="left")
  ts_obj.drop("period", axis = 1, inplace= True)
  ts_obj = ts_obj.rename(columns= {"index": "period"})

  meta_temp = eia_data.create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
  meta_temp["index"] = 1
  meta_df = pd.DataFrame([meta_temp])

  if i == series.index.start:
    data = ts_obj
    meta = meta_df
  else:
    data = data._append(ts_obj)
    meta = meta._append(meta_df)
{'parent': 'CISO', 'subba': 'PGAE'}
{'parent': 'CISO', 'subba': 'SCE'}
{'parent': 'CISO', 'subba': 'SDGE'}
{'parent': 'CISO', 'subba': 'VEA'}
Code
print(meta)
# The initial pull has some missing values

data.head()
   index parent subba                             time               start  \
0      1   CISO  PGAE 2024-06-22 19:17:11.504732+00:00 2018-07-01 08:00:00   
0      1   CISO   SCE 2024-06-22 19:17:30.010912+00:00 2018-07-01 08:00:00   
0      1   CISO  SDGE 2024-06-22 19:17:46.075978+00:00 2018-07-01 08:00:00   
0      1   CISO   VEA 2024-06-22 19:18:03.131569+00:00 2018-07-01 08:00:00   

                  end           start_act             end_act  start_match  \
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   

   end_match  n_obs  na      type  update  success  \
0       True  52218  98  backfill   False    False   
0       True  52218  98  backfill   False    False   
0       True  52218  98  backfill   False    False   
0       True  52218  98  backfill   False    False   

                      comments  
0  Missing values were found;   
0  Missing values were found;   
0  Missing values were found;   
0  Missing values were found;   
period subba subba-name parent parent-name value value-units
0 2018-07-01 08:00:00 PGAE Pacific Gas and Electric CISO California Independent System Operator 12522.0 megawatthours
1 2018-07-01 09:00:00 PGAE Pacific Gas and Electric CISO California Independent System Operator 11745.0 megawatthours
2 2018-07-01 10:00:00 PGAE Pacific Gas and Electric CISO California Independent System Operator 11200.0 megawatthours
3 2018-07-01 11:00:00 PGAE Pacific Gas and Electric CISO California Independent System Operator 10822.0 megawatthours
4 2018-07-01 12:00:00 PGAE Pacific Gas and Electric CISO California Independent System Operator 10644.0 megawatthours
Code
# Save the data
d = eia_data.append_data(data_path = data_path, new_data = data, init = True, save = True)
# Save the metadata
meta["success"] = True
meta["update"] = True
m = eia_data.append_metadata(meta_path = meta_path, meta = meta, save = True, init = True)
print(m)
Initial data pull
Save the data to CSV file
   index parent subba                             time               start  \
0      1   CISO  PGAE 2024-06-22 19:17:11.504732+00:00 2018-07-01 08:00:00   
0      1   CISO   SCE 2024-06-22 19:17:30.010912+00:00 2018-07-01 08:00:00   
0      1   CISO  SDGE 2024-06-22 19:17:46.075978+00:00 2018-07-01 08:00:00   
0      1   CISO   VEA 2024-06-22 19:18:03.131569+00:00 2018-07-01 08:00:00   

                  end           start_act             end_act  start_match  \
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   
0 2024-06-15 01:00:00 2018-07-01 08:00:00 2024-06-15 01:00:00         True   

   end_match  n_obs  na      type  update  success  \
0       True  52218  98  backfill    True     True   
0       True  52218  98  backfill    True     True   
0       True  52218  98  backfill    True     True   
0       True  52218  98  backfill    True     True   

                      comments  
0  Missing values were found;   
0  Missing values were found;   
0  Missing values were found;   
0  Missing values were found;   

Plot the Series

We will use Plotly to visualize the series:

Code
d = data.sort_values(by = ["subba", "period"])

p = px.line(d, x="period", y="value", color="subba")

p.show()
Code
profile = ProfileReport(d, title="Profiling Report")
profile